import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Horizontal bar chart of the 50 most common non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param text: chart title.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: hoist the stopword list into a set (original rebuilt it per word).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: use however many words exist (<= 50); the original hard-coded
    # range(50) and crashed when fewer than 50 distinct words were present.
    top_words = list(reversed(popular_words_nonstop[:50]))
    plt.barh(range(len(top_words)), [word_count_dict[w] for w in top_words], color='red')
    plt.yticks([i + 0.5 for i in range(len(top_words))], top_words)
    plt.title(text)  # BUG FIX: the title parameter was previously ignored
    plt.show()
# --- Script: load one day's tweets, plot common words and n-grams ---
import nltk
# Hoisted to the top of the run: ensure the NLTK stopword corpus exists
# before the plotting helpers (which call stopwords.words) run. download()
# is a no-op when the corpus is already cached.
nltk.download('stopwords')

df = pd.read_csv(r"/Users/archanaduraphe/twitterdata1.csv")

# Most frequent 3-grams and 2-grams of the tweet text.
three_gram = count_ngrams(df, 'text', 3, 3)
two_gram = count_ngrams(df, 'text', 2, 2)

# NOTE(review): defined but never used in this chunk — presumably intended
# as extra stopwords. Kept because later, unseen code might reference it.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']

plt.figure(figsize=(10, 10))
word_bar_graph_function(df, 'text', 'Most common words in the text')

# Top-10 3-grams, horizontal bars.
fig = px.bar(three_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 3-Words',
             orientation='h')
fig.show()

# Top-10 2-grams, horizontal bars.
fig = px.bar(two_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 2-Words',
             orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Horizontal bar chart of the 50 most common non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param text: chart title.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: hoist the stopword list into a set (original rebuilt it per word).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: use however many words exist (<= 50); the original hard-coded
    # range(50) and crashed when fewer than 50 distinct words were present.
    top_words = list(reversed(popular_words_nonstop[:50]))
    plt.barh(range(len(top_words)), [word_count_dict[w] for w in top_words], color='red')
    plt.yticks([i + 0.5 for i in range(len(top_words))], top_words)
    plt.title(text)  # BUG FIX: the title parameter was previously ignored
    plt.show()
# --- Script: load one day's tweets, plot common words and n-grams ---
import nltk
# Hoisted to the top of the run: ensure the NLTK stopword corpus exists
# before the plotting helpers (which call stopwords.words) run. download()
# is a no-op when the corpus is already cached.
nltk.download('stopwords')

df = pd.read_csv(r"/Users/archanaduraphe/twitterdata2526.csv")

# Most frequent 3-grams and 2-grams of the tweet text.
three_gram = count_ngrams(df, 'text', 3, 3)
two_gram = count_ngrams(df, 'text', 2, 2)

# NOTE(review): defined but never used in this chunk — presumably intended
# as extra stopwords. Kept because later, unseen code might reference it.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']

plt.figure(figsize=(10, 10))
word_bar_graph_function(df, 'text', 'Most common words in the text')

# Top-10 3-grams, horizontal bars.
fig = px.bar(three_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 3-Words',
             orientation='h')
fig.show()

# Top-10 2-grams, horizontal bars.
fig = px.bar(two_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 2-Words',
             orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Horizontal bar chart of the 50 most common non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param text: chart title.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: hoist the stopword list into a set (original rebuilt it per word).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: use however many words exist (<= 50); the original hard-coded
    # range(50) and crashed when fewer than 50 distinct words were present.
    top_words = list(reversed(popular_words_nonstop[:50]))
    plt.barh(range(len(top_words)), [word_count_dict[w] for w in top_words], color='red')
    plt.yticks([i + 0.5 for i in range(len(top_words))], top_words)
    plt.title(text)  # BUG FIX: the title parameter was previously ignored
    plt.show()
# --- Script: load one day's tweets, plot common words and n-grams ---
import nltk
# Hoisted to the top of the run: ensure the NLTK stopword corpus exists
# before the plotting helpers (which call stopwords.words) run. download()
# is a no-op when the corpus is already cached.
nltk.download('stopwords')

df = pd.read_csv(r"/Users/archanaduraphe/twitterdata2627.csv")

# Most frequent 3-grams and 2-grams of the tweet text.
three_gram = count_ngrams(df, 'text', 3, 3)
two_gram = count_ngrams(df, 'text', 2, 2)

# NOTE(review): defined but never used in this chunk — presumably intended
# as extra stopwords. Kept because later, unseen code might reference it.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']

plt.figure(figsize=(10, 10))
word_bar_graph_function(df, 'text', 'Most common words in the text')

# Top-10 3-grams, horizontal bars.
fig = px.bar(three_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 3-Words',
             orientation='h')
fig.show()

# Top-10 2-grams, horizontal bars.
fig = px.bar(two_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 2-Words',
             orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Horizontal bar chart of the 50 most common non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param text: chart title.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: hoist the stopword list into a set (original rebuilt it per word).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: use however many words exist (<= 50); the original hard-coded
    # range(50) and crashed when fewer than 50 distinct words were present.
    top_words = list(reversed(popular_words_nonstop[:50]))
    plt.barh(range(len(top_words)), [word_count_dict[w] for w in top_words], color='red')
    plt.yticks([i + 0.5 for i in range(len(top_words))], top_words)
    plt.title(text)  # BUG FIX: the title parameter was previously ignored
    plt.show()
# --- Script: load one day's tweets, plot common words and n-grams ---
import nltk
# Hoisted to the top of the run: ensure the NLTK stopword corpus exists
# before the plotting helpers (which call stopwords.words) run. download()
# is a no-op when the corpus is already cached.
nltk.download('stopwords')

df = pd.read_csv(r"/Users/archanaduraphe/twitterdata2728.csv")

# Most frequent 3-grams and 2-grams of the tweet text.
three_gram = count_ngrams(df, 'text', 3, 3)
two_gram = count_ngrams(df, 'text', 2, 2)

# NOTE(review): defined but never used in this chunk — presumably intended
# as extra stopwords. Kept because later, unseen code might reference it.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']

plt.figure(figsize=(10, 10))
word_bar_graph_function(df, 'text', 'Most common words in the text')

# Top-10 3-grams, horizontal bars.
fig = px.bar(three_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 3-Words',
             orientation='h')
fig.show()

# Top-10 2-grams, horizontal bars.
fig = px.bar(two_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 2-Words',
             orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Horizontal bar chart of the 50 most common non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param text: chart title.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: hoist the stopword list into a set (original rebuilt it per word).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: use however many words exist (<= 50); the original hard-coded
    # range(50) and crashed when fewer than 50 distinct words were present.
    top_words = list(reversed(popular_words_nonstop[:50]))
    plt.barh(range(len(top_words)), [word_count_dict[w] for w in top_words], color='red')
    plt.yticks([i + 0.5 for i in range(len(top_words))], top_words)
    plt.title(text)  # BUG FIX: the title parameter was previously ignored
    plt.show()
# --- Script: load one day's tweets, plot common words and n-grams ---
import nltk
# Hoisted to the top of the run: ensure the NLTK stopword corpus exists
# before the plotting helpers (which call stopwords.words) run. download()
# is a no-op when the corpus is already cached.
nltk.download('stopwords')

df = pd.read_csv(r"/Users/archanaduraphe/twitterdata2829.csv")

# Most frequent 3-grams and 2-grams of the tweet text.
three_gram = count_ngrams(df, 'text', 3, 3)
two_gram = count_ngrams(df, 'text', 2, 2)

# NOTE(review): defined but never used in this chunk — presumably intended
# as extra stopwords. Kept because later, unseen code might reference it.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']

plt.figure(figsize=(10, 10))
word_bar_graph_function(df, 'text', 'Most common words in the text')

# Top-10 3-grams, horizontal bars.
fig = px.bar(three_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 3-Words',
             orientation='h')
fig.show()

# Top-10 2-grams, horizontal bars.
fig = px.bar(two_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 2-Words',
             orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Horizontal bar chart of the 50 most common non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param text: chart title.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: hoist the stopword list into a set (original rebuilt it per word).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: use however many words exist (<= 50); the original hard-coded
    # range(50) and crashed when fewer than 50 distinct words were present.
    top_words = list(reversed(popular_words_nonstop[:50]))
    plt.barh(range(len(top_words)), [word_count_dict[w] for w in top_words], color='red')
    plt.yticks([i + 0.5 for i in range(len(top_words))], top_words)
    plt.title(text)  # BUG FIX: the title parameter was previously ignored
    plt.show()
# --- Script: load one day's tweets, plot common words and n-grams ---
import nltk
# Hoisted to the top of the run: ensure the NLTK stopword corpus exists
# before the plotting helpers (which call stopwords.words) run. download()
# is a no-op when the corpus is already cached.
nltk.download('stopwords')

df = pd.read_csv(r"/Users/archanaduraphe/twitterdata2931.csv")

# Most frequent 3-grams and 2-grams of the tweet text.
three_gram = count_ngrams(df, 'text', 3, 3)
two_gram = count_ngrams(df, 'text', 2, 2)

# NOTE(review): defined but never used in this chunk — presumably intended
# as extra stopwords. Kept because later, unseen code might reference it.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']

plt.figure(figsize=(10, 10))
word_bar_graph_function(df, 'text', 'Most common words in the text')

# Top-10 3-grams, horizontal bars.
fig = px.bar(three_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 3-Words',
             orientation='h')
fig.show()

# Top-10 2-grams, horizontal bars.
fig = px.bar(two_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 2-Words',
             orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Horizontal bar chart of the 50 most common non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param text: chart title.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: hoist the stopword list into a set (original rebuilt it per word).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: use however many words exist (<= 50); the original hard-coded
    # range(50) and crashed when fewer than 50 distinct words were present.
    top_words = list(reversed(popular_words_nonstop[:50]))
    plt.barh(range(len(top_words)), [word_count_dict[w] for w in top_words], color='red')
    plt.yticks([i + 0.5 for i in range(len(top_words))], top_words)
    plt.title(text)  # BUG FIX: the title parameter was previously ignored
    plt.show()
# --- Script: load one day's tweets, plot common words and n-grams ---
import nltk
# Hoisted to the top of the run: ensure the NLTK stopword corpus exists
# before the plotting helpers (which call stopwords.words) run. download()
# is a no-op when the corpus is already cached.
nltk.download('stopwords')

df = pd.read_csv(r"/Users/archanaduraphe/twitterdata3101.csv")

# Most frequent 3-grams and 2-grams of the tweet text.
three_gram = count_ngrams(df, 'text', 3, 3)
two_gram = count_ngrams(df, 'text', 2, 2)

# NOTE(review): defined but never used in this chunk — presumably intended
# as extra stopwords. Kept because later, unseen code might reference it.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']

plt.figure(figsize=(10, 10))
word_bar_graph_function(df, 'text', 'Most common words in the text')

# Top-10 3-grams, horizontal bars.
fig = px.bar(three_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 3-Words',
             orientation='h')
fig.show()

# Top-10 2-grams, horizontal bars.
fig = px.bar(two_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 2-Words',
             orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Horizontal bar chart of the 50 most common non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param text: chart title.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: hoist the stopword list into a set (original rebuilt it per word).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: use however many words exist (<= 50); the original hard-coded
    # range(50) and crashed when fewer than 50 distinct words were present.
    top_words = list(reversed(popular_words_nonstop[:50]))
    plt.barh(range(len(top_words)), [word_count_dict[w] for w in top_words], color='red')
    plt.yticks([i + 0.5 for i in range(len(top_words))], top_words)
    plt.title(text)  # BUG FIX: the title parameter was previously ignored
    plt.show()
# --- Script: load one day's tweets, plot common words and n-grams ---
import nltk
# Hoisted to the top of the run: ensure the NLTK stopword corpus exists
# before the plotting helpers (which call stopwords.words) run. download()
# is a no-op when the corpus is already cached.
nltk.download('stopwords')

df = pd.read_csv(r"/Users/archanaduraphe/twitterdata0102.csv")

# Most frequent 3-grams and 2-grams of the tweet text.
three_gram = count_ngrams(df, 'text', 3, 3)
two_gram = count_ngrams(df, 'text', 2, 2)

# NOTE(review): defined but never used in this chunk — presumably intended
# as extra stopwords. Kept because later, unseen code might reference it.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']

plt.figure(figsize=(10, 10))
word_bar_graph_function(df, 'text', 'Most common words in the text')

# Top-10 3-grams, horizontal bars.
fig = px.bar(three_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 3-Words',
             orientation='h')
fig.show()

# Top-10 2-grams, horizontal bars.
fig = px.bar(two_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 2-Words',
             orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Horizontal bar chart of the 50 most common non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param text: chart title.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: hoist the stopword list into a set (original rebuilt it per word).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: use however many words exist (<= 50); the original hard-coded
    # range(50) and crashed when fewer than 50 distinct words were present.
    top_words = list(reversed(popular_words_nonstop[:50]))
    plt.barh(range(len(top_words)), [word_count_dict[w] for w in top_words], color='red')
    plt.yticks([i + 0.5 for i in range(len(top_words))], top_words)
    plt.title(text)  # BUG FIX: the title parameter was previously ignored
    plt.show()
# --- Script: load one day's tweets, plot common words and n-grams ---
import nltk
# Hoisted to the top of the run: ensure the NLTK stopword corpus exists
# before the plotting helpers (which call stopwords.words) run. download()
# is a no-op when the corpus is already cached.
nltk.download('stopwords')

df = pd.read_csv(r"/Users/archanaduraphe/twitterdata0203.csv")

# Most frequent 3-grams and 2-grams of the tweet text.
three_gram = count_ngrams(df, 'text', 3, 3)
two_gram = count_ngrams(df, 'text', 2, 2)

# NOTE(review): defined but never used in this chunk — presumably intended
# as extra stopwords. Kept because later, unseen code might reference it.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']

plt.figure(figsize=(10, 10))
word_bar_graph_function(df, 'text', 'Most common words in the text')

# Top-10 3-grams, horizontal bars.
fig = px.bar(three_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 3-Words',
             orientation='h')
fig.show()

# Top-10 2-grams, horizontal bars.
fig = px.bar(two_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 2-Words',
             orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Horizontal bar chart of the 50 most common non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param text: chart title.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: hoist the stopword list into a set (original rebuilt it per word).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: use however many words exist (<= 50); the original hard-coded
    # range(50) and crashed when fewer than 50 distinct words were present.
    top_words = list(reversed(popular_words_nonstop[:50]))
    plt.barh(range(len(top_words)), [word_count_dict[w] for w in top_words], color='red')
    plt.yticks([i + 0.5 for i in range(len(top_words))], top_words)
    plt.title(text)  # BUG FIX: the title parameter was previously ignored
    plt.show()
# --- Script: load one day's tweets, plot common words and n-grams ---
import nltk
# Hoisted to the top of the run: ensure the NLTK stopword corpus exists
# before the plotting helpers (which call stopwords.words) run. download()
# is a no-op when the corpus is already cached.
nltk.download('stopwords')

df = pd.read_csv(r"/Users/archanaduraphe/twitterdata0304.csv")

# Most frequent 3-grams and 2-grams of the tweet text.
three_gram = count_ngrams(df, 'text', 3, 3)
two_gram = count_ngrams(df, 'text', 2, 2)

# NOTE(review): defined but never used in this chunk — presumably intended
# as extra stopwords. Kept because later, unseen code might reference it.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']

plt.figure(figsize=(10, 10))
word_bar_graph_function(df, 'text', 'Most common words in the text')

# Top-10 3-grams, horizontal bars.
fig = px.bar(three_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 3-Words',
             orientation='h')
fig.show()

# Top-10 2-grams, horizontal bars.
fig = px.bar(two_gram.sort_values('frequency', ascending=False)[0:10],
             x="frequency",
             y="ngram",
             title='Most Common 2-Words',
             orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Count word n-grams in ``dataframe[column]``.

    :param dataframe: pandas.DataFrame holding the text.
    :param column: name of the text column to analyze.
    :param begin_ngram: smallest n-gram size to count.
    :param end_ngram: largest n-gram size to count.
    :return: DataFrame with columns ['frequency', 'ngram'],
             sorted by descending frequency.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    # BUG FIX: the original read the global df['text'], ignoring both the
    # `dataframe` and `column` parameters.
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise sum over all documents -> total frequency per n-gram.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API, fall back for older versions.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # BUG FIX: reset_index() returns a new frame; the original discarded it.
    most_common = most_common.reset_index(drop=True)
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Render a word cloud of the most frequent non-stopwords in ``df[column]``.

    :param df: pandas.DataFrame holding the text.
    :param column: text column to tokenize (whitespace split, lower-cased).
    :param number_of_words: maximum number of words drawn in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = Counter(topic_words)
    # PERF FIX: build the NLTK stopword set once; the original called
    # stopwords.words("english") for every word (quadratic behavior).
    english_stopwords = set(stopwords.words("english"))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # BUG FIX: join with spaces; the original passed str(list), feeding the
    # list repr's brackets/quotes into the WordCloud tokenizer.
    word_string = " ".join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Plot a horizontal bar chart of the most common non-stopword words.

    Args:
        df: pandas DataFrame containing the text.
        column: name of the column to tokenize (non-string cells are skipped).
        text: chart title (the original accepted this but never used it).
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = dict(Counter(topic_words))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    # Fix: build the stopword set once instead of per-word.
    english_stopwords = set(stopwords.words("english"))
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # Fix: size the axis to the data; the original hard-coded range(50) and
    # raised a length-mismatch error with fewer than 50 distinct words.
    top_words = popular_words_nonstop[:50]
    plt.barh(range(len(top_words)),
             [word_count_dict[w] for w in reversed(top_words)],
             color='red')
    plt.yticks([y + 0.5 for y in range(len(top_words))], list(reversed(top_words)))
    # Fix: actually use the `text` parameter as the chart title.
    plt.title(text)
    plt.show()
# --- Notebook cell: tweet text analysis for twitterdata0407.csv ---
# NOTE(review): machine-specific absolute path; parameterize before reuse.
df = pd.read_csv(r"/Users/archanaduraphe/twitterdata0407.csv")
# Most common 3-word and 2-word phrases in the tweet text.
three_gram = count_ngrams(df,'text',3,3)
two_gram = count_ngrams(df,'text',2,2)
# NOTE(review): defined but never referenced below — appears to be dead code.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']
import nltk
nltk.download('stopwords')  # stopword corpus required by word_bar_graph_function
plt.figure(figsize=(10,10))
word_bar_graph_function(df,'text','Most common words in the text')
# Top-10 most frequent 3-grams as a horizontal bar chart.
fig = px.bar(three_gram.sort_values('frequency',ascending=False)[0:10],
x="frequency",
y="ngram",
title='Most Common 3-Words',
orientation='h')
fig.show()
# Top-10 most frequent 2-grams as a horizontal bar chart.
fig = px.bar(two_gram.sort_values('frequency',ascending=False)[0:10],
x="frequency",
y="ngram",
title='Most Common 2-Words',
orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Return word n-gram frequencies for a dataframe text column.

    Fixes the original implementation, which ignored both the `dataframe`
    and `column` parameters and always read the global `df['text']`.

    Args:
        dataframe: pandas DataFrame containing the text.
        column: name of the text column to analyze.
        begin_ngram: smallest n-gram size to count.
        end_ngram: largest n-gram size to count.

    Returns:
        DataFrame indexed by n-gram, sorted by descending 'frequency',
        with an 'ngram' column mirroring the index.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise totals over all documents (same result as the original
    # sum(sparse_matrix).toarray()[0], but one vectorized call).
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; support both APIs.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # NOTE: the original called most_common.reset_index() and discarded the
    # result (a no-op); that line is intentionally omitted.
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Display a word cloud of the most frequent non-stopword words.

    Args:
        df: pandas DataFrame containing the text.
        column: name of the column to tokenize (non-string cells are skipped).
        number_of_words: maximum number of words rendered in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = dict(Counter(topic_words))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    # Fix: build the stopword set once; the original rebuilt the NLTK list
    # (and scanned it linearly) for every distinct word.
    english_stopwords = set(stopwords.words("english"))
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # Fix: join with spaces; str(list) leaked brackets/quotes/commas into the cloud.
    word_string = ' '.join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Plot a horizontal bar chart of the most common non-stopword words.

    Args:
        df: pandas DataFrame containing the text.
        column: name of the column to tokenize (non-string cells are skipped).
        text: chart title (the original accepted this but never used it).
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = dict(Counter(topic_words))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    # Fix: build the stopword set once instead of per-word.
    english_stopwords = set(stopwords.words("english"))
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # Fix: size the axis to the data; the original hard-coded range(50) and
    # raised a length-mismatch error with fewer than 50 distinct words.
    top_words = popular_words_nonstop[:50]
    plt.barh(range(len(top_words)),
             [word_count_dict[w] for w in reversed(top_words)],
             color='red')
    plt.yticks([y + 0.5 for y in range(len(top_words))], list(reversed(top_words)))
    # Fix: actually use the `text` parameter as the chart title.
    plt.title(text)
    plt.show()
# --- Notebook cell: tweet text analysis for twitterdata0708.csv ---
# NOTE(review): machine-specific absolute path; parameterize before reuse.
df = pd.read_csv(r"/Users/archanaduraphe/twitterdata0708.csv")
# Most common 3-word and 2-word phrases in the tweet text.
three_gram = count_ngrams(df,'text',3,3)
two_gram = count_ngrams(df,'text',2,2)
# NOTE(review): defined but never referenced below — appears to be dead code.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']
import nltk
nltk.download('stopwords')  # stopword corpus required by word_bar_graph_function
plt.figure(figsize=(10,10))
word_bar_graph_function(df,'text','Most common words in the text')
# Top-10 most frequent 3-grams as a horizontal bar chart.
fig = px.bar(three_gram.sort_values('frequency',ascending=False)[0:10],
x="frequency",
y="ngram",
title='Most Common 3-Words',
orientation='h')
fig.show()
# Top-10 most frequent 2-grams as a horizontal bar chart.
fig = px.bar(two_gram.sort_values('frequency',ascending=False)[0:10],
x="frequency",
y="ngram",
title='Most Common 2-Words',
orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Return word n-gram frequencies for a dataframe text column.

    Fixes the original implementation, which ignored both the `dataframe`
    and `column` parameters and always read the global `df['text']`.

    Args:
        dataframe: pandas DataFrame containing the text.
        column: name of the text column to analyze.
        begin_ngram: smallest n-gram size to count.
        end_ngram: largest n-gram size to count.

    Returns:
        DataFrame indexed by n-gram, sorted by descending 'frequency',
        with an 'ngram' column mirroring the index.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise totals over all documents (same result as the original
    # sum(sparse_matrix).toarray()[0], but one vectorized call).
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; support both APIs.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # NOTE: the original called most_common.reset_index() and discarded the
    # result (a no-op); that line is intentionally omitted.
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Display a word cloud of the most frequent non-stopword words.

    Args:
        df: pandas DataFrame containing the text.
        column: name of the column to tokenize (non-string cells are skipped).
        number_of_words: maximum number of words rendered in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = dict(Counter(topic_words))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    # Fix: build the stopword set once; the original rebuilt the NLTK list
    # (and scanned it linearly) for every distinct word.
    english_stopwords = set(stopwords.words("english"))
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # Fix: join with spaces; str(list) leaked brackets/quotes/commas into the cloud.
    word_string = ' '.join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Plot a horizontal bar chart of the most common non-stopword words.

    Args:
        df: pandas DataFrame containing the text.
        column: name of the column to tokenize (non-string cells are skipped).
        text: chart title (the original accepted this but never used it).
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = dict(Counter(topic_words))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    # Fix: build the stopword set once instead of per-word.
    english_stopwords = set(stopwords.words("english"))
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # Fix: size the axis to the data; the original hard-coded range(50) and
    # raised a length-mismatch error with fewer than 50 distinct words.
    top_words = popular_words_nonstop[:50]
    plt.barh(range(len(top_words)),
             [word_count_dict[w] for w in reversed(top_words)],
             color='red')
    plt.yticks([y + 0.5 for y in range(len(top_words))], list(reversed(top_words)))
    # Fix: actually use the `text` parameter as the chart title.
    plt.title(text)
    plt.show()
# --- Notebook cell: tweet text analysis for twitterdata0810.csv ---
# NOTE(review): machine-specific absolute path; parameterize before reuse.
df = pd.read_csv(r"/Users/archanaduraphe/twitterdata0810.csv")
# Most common 3-word and 2-word phrases in the tweet text.
three_gram = count_ngrams(df,'text',3,3)
two_gram = count_ngrams(df,'text',2,2)
# NOTE(review): defined but never referenced below — appears to be dead code.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']
import nltk
nltk.download('stopwords')  # stopword corpus required by word_bar_graph_function
plt.figure(figsize=(10,10))
word_bar_graph_function(df,'text','Most common words in the text')
# Top-10 most frequent 3-grams as a horizontal bar chart.
fig = px.bar(three_gram.sort_values('frequency',ascending=False)[0:10],
x="frequency",
y="ngram",
title='Most Common 3-Words',
orientation='h')
fig.show()
# Top-10 most frequent 2-grams as a horizontal bar chart.
fig = px.bar(two_gram.sort_values('frequency',ascending=False)[0:10],
x="frequency",
y="ngram",
title='Most Common 2-Words',
orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Return word n-gram frequencies for a dataframe text column.

    Fixes the original implementation, which ignored both the `dataframe`
    and `column` parameters and always read the global `df['text']`.

    Args:
        dataframe: pandas DataFrame containing the text.
        column: name of the text column to analyze.
        begin_ngram: smallest n-gram size to count.
        end_ngram: largest n-gram size to count.

    Returns:
        DataFrame indexed by n-gram, sorted by descending 'frequency',
        with an 'ngram' column mirroring the index.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise totals over all documents (same result as the original
    # sum(sparse_matrix).toarray()[0], but one vectorized call).
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; support both APIs.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # NOTE: the original called most_common.reset_index() and discarded the
    # result (a no-op); that line is intentionally omitted.
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Display a word cloud of the most frequent non-stopword words.

    Args:
        df: pandas DataFrame containing the text.
        column: name of the column to tokenize (non-string cells are skipped).
        number_of_words: maximum number of words rendered in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = dict(Counter(topic_words))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    # Fix: build the stopword set once; the original rebuilt the NLTK list
    # (and scanned it linearly) for every distinct word.
    english_stopwords = set(stopwords.words("english"))
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # Fix: join with spaces; str(list) leaked brackets/quotes/commas into the cloud.
    word_string = ' '.join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Plot a horizontal bar chart of the most common non-stopword words.

    Args:
        df: pandas DataFrame containing the text.
        column: name of the column to tokenize (non-string cells are skipped).
        text: chart title (the original accepted this but never used it).
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = dict(Counter(topic_words))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    # Fix: build the stopword set once instead of per-word.
    english_stopwords = set(stopwords.words("english"))
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # Fix: size the axis to the data; the original hard-coded range(50) and
    # raised a length-mismatch error with fewer than 50 distinct words.
    top_words = popular_words_nonstop[:50]
    plt.barh(range(len(top_words)),
             [word_count_dict[w] for w in reversed(top_words)],
             color='red')
    plt.yticks([y + 0.5 for y in range(len(top_words))], list(reversed(top_words)))
    # Fix: actually use the `text` parameter as the chart title.
    plt.title(text)
    plt.show()
# --- Notebook cell: tweet text analysis for twitterdata1011.csv ---
# NOTE(review): machine-specific absolute path; parameterize before reuse.
df = pd.read_csv(r"/Users/archanaduraphe/twitterdata1011.csv")
# Most common 3-word and 2-word phrases in the tweet text.
three_gram = count_ngrams(df,'text',3,3)
two_gram = count_ngrams(df,'text',2,2)
# NOTE(review): defined but never referenced below — appears to be dead code.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']
import nltk
nltk.download('stopwords')  # stopword corpus required by word_bar_graph_function
plt.figure(figsize=(10,10))
word_bar_graph_function(df,'text','Most common words in the text')
# Top-10 most frequent 3-grams as a horizontal bar chart.
fig = px.bar(three_gram.sort_values('frequency',ascending=False)[0:10],
x="frequency",
y="ngram",
title='Most Common 3-Words',
orientation='h')
fig.show()
# Top-10 most frequent 2-grams as a horizontal bar chart.
fig = px.bar(two_gram.sort_values('frequency',ascending=False)[0:10],
x="frequency",
y="ngram",
title='Most Common 2-Words',
orientation='h')
fig.show()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    """Return word n-gram frequencies for a dataframe text column.

    Fixes the original implementation, which ignored both the `dataframe`
    and `column` parameters and always read the global `df['text']`.

    Args:
        dataframe: pandas DataFrame containing the text.
        column: name of the text column to analyze.
        begin_ngram: smallest n-gram size to count.
        end_ngram: largest n-gram size to count.

    Returns:
        DataFrame indexed by n-gram, sorted by descending 'frequency',
        with an 'ngram' column mirroring the index.
    """
    # adapted from https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # Column-wise totals over all documents (same result as the original
    # sum(sparse_matrix).toarray()[0], but one vectorized call).
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; support both APIs.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()
    most_common = pd.DataFrame(frequencies,
                               index=feature_names,
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    # NOTE: the original called most_common.reset_index() and discarded the
    # result (a no-op); that line is intentionally omitted.
    return most_common
def word_cloud_function(df, column, number_of_words):
    """Display a word cloud of the most frequent non-stopword words.

    Args:
        df: pandas DataFrame containing the text.
        column: name of the column to tokenize (non-string cells are skipped).
        number_of_words: maximum number of words rendered in the cloud.
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = dict(Counter(topic_words))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    # Fix: build the stopword set once; the original rebuilt the NLTK list
    # (and scanned it linearly) for every distinct word.
    english_stopwords = set(stopwords.words("english"))
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # Fix: join with spaces; str(list) leaked brackets/quotes/commas into the cloud.
    word_string = ' '.join(popular_words_nonstop)
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=number_of_words,
                          width=1000, height=1000,
                          ).generate(word_string)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
def word_bar_graph_function(df, column, text):
    """Plot a horizontal bar chart of the most common non-stopword words.

    Args:
        df: pandas DataFrame containing the text.
        column: name of the column to tokenize (non-string cells are skipped).
        text: chart title (the original accepted this but never used it).
    """
    # adapted from https://www.kaggle.com/benhamner/most-common-forum-topic-words
    topic_words = [word.lower()
                   for cell in df[column] if isinstance(cell, str)
                   for word in cell.split()]
    word_count_dict = dict(Counter(topic_words))
    popular_words = sorted(word_count_dict, key=word_count_dict.get, reverse=True)
    # Fix: build the stopword set once instead of per-word.
    english_stopwords = set(stopwords.words("english"))
    popular_words_nonstop = [w for w in popular_words if w not in english_stopwords]
    # Fix: size the axis to the data; the original hard-coded range(50) and
    # raised a length-mismatch error with fewer than 50 distinct words.
    top_words = popular_words_nonstop[:50]
    plt.barh(range(len(top_words)),
             [word_count_dict[w] for w in reversed(top_words)],
             color='red')
    plt.yticks([y + 0.5 for y in range(len(top_words))], list(reversed(top_words)))
    # Fix: actually use the `text` parameter as the chart title.
    plt.title(text)
    plt.show()
# --- Notebook cell: tweet text analysis for twitterdata1112.csv ---
# NOTE(review): machine-specific absolute path; parameterize before reuse.
df = pd.read_csv(r"/Users/archanaduraphe/twitterdata1112.csv")
# Most common 3-word and 2-word phrases in the tweet text.
three_gram = count_ngrams(df,'text',3,3)
two_gram = count_ngrams(df,'text',2,2)
# NOTE(review): defined but never referenced below — appears to be dead code.
words_to_exclude = ["my","to","at","for","it","the","with","from","would","there","or","if","it","but","of","in","as","and",'NaN','dtype','...','.','hai','ni','k','tu','ko','?']
import nltk
nltk.download('stopwords')  # stopword corpus required by word_bar_graph_function
plt.figure(figsize=(10,10))
word_bar_graph_function(df,'text','Most common words in the text')
# Top-10 most frequent 3-grams as a horizontal bar chart.
fig = px.bar(three_gram.sort_values('frequency',ascending=False)[0:10],
x="frequency",
y="ngram",
title='Most Common 3-Words',
orientation='h')
fig.show()
# Top-10 most frequent 2-grams as a horizontal bar chart.
fig = px.bar(two_gram.sort_values('frequency',ascending=False)[0:10],
x="frequency",
y="ngram",
title='Most Common 2-Words',
orientation='h')
fig.show()